library(here)
library(tidyverse)
library(conflicted)
# library(easystats)
# Load the exoplanet catalog from <project root>/data.
# `guess_max = Inf` makes readr look at every row when guessing column types
# instead of only the first 1000. Sparse columns whose leading rows are all
# empty are otherwise guessed as logical, and later non-logical values then
# fail to parse and silently become NA -- a likely source of the
# "One or more parsing issues" warning this read used to emit.
# NOTE(review): run `problems(exoplanets)` to confirm the warning is gone.
exoplanets <- read_csv(
  here("data", "exoplanet_catalog_080325.csv"),
  guess_max = Inf
)
Warning: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
dat <- vroom(...)
problems(dat)Rows: 7418 Columns: 98── Column specification ────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (12): name, planet_status, publication, detection_type, mass_measurement_type, radius_measurement_type,...
dbl (83): mass, mass_error_min, mass_error_max, mass_sini, mass_sini_error_min, mass_sini_error_max, radius...
lgl (2): hot_point_lon, star_magnetic_field
date (1): updated
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the loaded tibble for a first look at the data.
exoplanets
library(skimr)
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
# Summarise every column (type, missingness, distribution) with skimr.
# Called directly rather than through a pipe so the summary reports the
# data name as "exoplanets".
skim(exoplanets)
Warning: There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names), mangled_skimmers$funs)`.
ℹ In group 0: .
Caused by warning:
! There was 1 warning in `dplyr::summarize()`.
ℹ In argument: `dplyr::across(tidyselect::any_of(variable_names), mangled_skimmers$funs)`.
Caused by warning in `inline_hist()`:
! Variable contains Inf or -Inf value(s) that were converted to NA.
── Data Summary ────────────────────────
Values
Name exoplanets
Number of rows 7418
Number of columns 98
_______________________
Column type frequency:
character 12
Date 1
logical 2
numeric 83
________________________
Group variables None
library(naniar)
# Plot the number of missing values per column.
# Optional plot-size override, left disabled:
# options(repr.plot.width = 10, repr.plot.height = 20)
exoplanets %>%
  gg_miss_var()
library(visdat)
# Visualise column types and missingness across the whole data frame.
# Optional plot-size override, left disabled:
# options(repr.plot.width = 20, repr.plot.height = 10)
exoplanets %>%
  vis_dat()
# List all column names of the catalog.
colnames(exoplanets)
[1] "name" "planet_status" "mass"
[4] "mass_error_min" "mass_error_max" "mass_sini"
[7] "mass_sini_error_min" "mass_sini_error_max" "radius"
[10] "radius_error_min" "radius_error_max" "orbital_period"
[13] "orbital_period_error_min" "orbital_period_error_max" "semi_major_axis"
[16] "semi_major_axis_error_min" "semi_major_axis_error_max" "eccentricity"
[19] "eccentricity_error_min" "eccentricity_error_max" "inclination"
[22] "inclination_error_min" "inclination_error_max" "angular_distance"
[25] "discovered" "updated" "omega"
[28] "omega_error_min" "omega_error_max" "tperi"
[31] "tperi_error_min" "tperi_error_max" "tconj"
[34] "tconj_error_min" "tconj_error_max" "tzero_tr"
[37] "tzero_tr_error_min" "tzero_tr_error_max" "tzero_tr_sec"
[40] "tzero_tr_sec_error_min" "tzero_tr_sec_error_max" "lambda_angle"
[43] "lambda_angle_error_min" "lambda_angle_error_max" "impact_parameter"
[46] "impact_parameter_error_min" "impact_parameter_error_max" "tzero_vr"
[49] "tzero_vr_error_min" "tzero_vr_error_max" "k"
[52] "k_error_min" "k_error_max" "temp_calculated"
[55] "temp_calculated_error_min" "temp_calculated_error_max" "temp_measured"
[58] "hot_point_lon" "geometric_albedo" "geometric_albedo_error_min"
[61] "geometric_albedo_error_max" "log_g" "publication"
[64] "detection_type" "mass_measurement_type" "radius_measurement_type"
[67] "alternate_names" "molecules" "star_name"
[70] "ra" "dec" "mag_v"
[73] "mag_i" "mag_j" "mag_h"
[76] "mag_k" "star_distance" "star_distance_error_min"
[79] "star_distance_error_max" "star_metallicity" "star_metallicity_error_min"
[82] "star_metallicity_error_max" "star_mass" "star_mass_error_min"
[85] "star_mass_error_max" "star_radius" "star_radius_error_min"
[88] "star_radius_error_max" "star_sp_type" "star_age"
[91] "star_age_error_min" "star_age_error_max" "star_teff"
[94] "star_teff_error_min" "star_teff_error_max" "star_detected_disc"
[97] "star_magnetic_field" "star_alternate_names"
library(janitor)
# Frequency table (count and proportion) of the planet_status column.
tabyl(exoplanets, planet_status)
planet_status n percent
Confirmed 7418 1
library(data.table)
# options(repr.matrix.max.rows=100)
# Preview the five rows with the lowest share of missing values, transposed
# so that every catalog column becomes one row (original column names are
# kept in a leading "column" column).
preview <- exoplanets %>%
  add_prop_miss() %>% # adds the `prop_miss_all` column used for sorting
  arrange(prop_miss_all) %>%
  head(5) %>%
  data.table::transpose(keep.names = "column")
preview
# Call View() directly so the viewer is titled "preview" instead of ".",
# and only in interactive sessions, where a data viewer is available.
if (interactive()) {
  View(preview)
}
The catalog contains many features, including:
- Planet name
- Mass (in Jupiter masses, M_Jup)
- Mass*sin(i) (M_Jup): the minimum mass of the planet; the true mass depends on the orbital inclination i
# Look up every planet whose name contains "TOI-784".
# dplyr::filter() is namespaced explicitly: with {conflicted} attached, a bare
# filter() is ambiguous (dplyr vs. stats) and errors until a preference is
# declared, and that declaration only happens further down in this script.
exoplanets %>%
  dplyr::filter(str_like(name, "%TOI-784%"))
# Tell {conflicted} to resolve every bare filter() call to dplyr::filter.
# NOTE(review): this would be more robust next to the library() calls at the
# top of the script, before filter() is first used.
conflicts_prefer(dplyr::filter)
[conflicted] Removing existing preference.[conflicted] Will prefer dplyr::filter over any other package.
# Keep only the planets whose `discovered` value is 2023.
filter(exoplanets, discovered == 2023)
# Drop every column with "error" in its name, plus the planet_status and
# updated columns.
select(
  exoplanets,
  -contains("error"),
  -planet_status,
  -updated
)
# Frequency table (count and proportion) of the detection_type column.
exoplanets %>%
  tabyl(detection_type)
detection_type n percent
Astrometry 46 0.0062011324
Astrometry, Imaging 1 0.0001348072
Astrometry, Radial Velocity 3 0.0004044217
Imaging 922 0.1242922621
Imaging, Astrometry 49 0.0066055541
Imaging, Kinematic 2 0.0002696145
Imaging, Other 46 0.0062011324
Imaging, Other, Astrometry 1 0.0001348072
Imaging, Other, Kinematic 3 0.0004044217
Imaging, Primary Transit 1 0.0001348072
Imaging, Radial Velocity, Astrometry 1 0.0001348072
Kinematic 2 0.0002696145
Microlensing 313 0.0421946616
Other 42 0.0056619035
Other, Imaging 1 0.0001348072
Other, Imaging, Kinematic 1 0.0001348072
Other, Radial Velocity 1 0.0001348072
Primary Transit 4509 0.6078457805
Primary Transit, Astrometry 1 0.0001348072
Primary Transit, Kinematic 1 0.0001348072
Primary Transit, Radial Velocity 7 0.0009436506
Primary Transit, TTV 2 0.0002696145
Radial Velocity 1145 0.1543542734
Radial Velocity, Astrometry 99 0.0133459153
Radial Velocity, Imaging 2 0.0002696145
Radial Velocity, Primary Transit 7 0.0009436506
Radial Velocity, Timing 1 0.0001348072
TTV 32 0.0043138312
Timing 160 0.0215691561
Timing, Astrometry 1 0.0001348072
Timing, Kinematic 10 0.0013480723
Timing, Other 6 0.0008088434
# Display the catalog tibble again.
exoplanets